cor2cov = function(R, std) {
Sigma = outer(std, std) * R
return(Sigma)
}
get_upper_tri = function(cormat){
cormat[lower.tri(cormat)] = NA
diag(cormat) = NA
return(cormat)
}
p_dens = function(p_mat) {
p_vec = p_mat[upper.tri(p_mat)]
df_line = data.frame(xintercept = c(0.01, 0.05),
Lines = c("p = 0.01", "p = 0.05"),
stringsAsFactors = FALSE)
fig_p = data.frame(p = p_vec) %>%
ggplot(aes(x = p)) +
stat_ecdf(geom = "point") +
geom_vline(aes(xintercept = xintercept, color = Lines), df_line,
linetype = "dashed", size = 1) +
labs(x = "P-value", y = "Cumulative density") +
scale_color_discrete(name = NULL) +
theme_bw()
return(fig_p)
}
p_filter = function(mat, mat_p, max_p){
ind_p = mat_p
ind_p[mat_p > max_p] = 0
ind_p[mat_p <= max_p] = 1
mat_filter = mat * ind_p
return(mat_filter)
}
hard_thresh = function(R, th){
R_th = R
R_th[abs(R) <= th] = 0
return(R_th)
}
1. Data generation
1.1 Absolute abundances
set.seed(123)
n = 50
d = 100
mu = c(rep(NA, 4), # Taxa with correlations
10000, 10000, # High abundant taxa
sample(c(200, 50), d - 6, replace = TRUE, prob = c(0.8, 0.2))) # Low abundant taxa
# NB distribution
dispersion = 0.5
# Absolute abundances
A = matrix(NA, ncol = n, nrow = d)
for (i in seq.int(from = 5, to = d)) {
A[i, ] = rnbinom(n = n, size = 1/dispersion, mu = mu[i])
}
# Dependent pairs of taxa
abn_mean = c(2000, 2000)
template_var = log((abn_mean + dispersion * abn_mean^2)/abn_mean^2 + 1)
template_mean = log(abn_mean) - template_var/2
Sigma = cor2cov(R = diag(1, nrow = 2), std = sqrt(template_var))
template = t(MASS::mvrnorm(n = n, mu = template_mean, Sigma = Sigma))
template1 = template[1, ]
template2 = template[2, ]
poly1 = template_mean[1] * poly(x = template1, degree = 1, raw = FALSE)[, 1] + template_mean[1]
poly2 = template_mean[2] * poly(x = template2, degree = 2, raw = FALSE)[, 2] + template_mean[2]
A_dep = round(exp(rbind(template, poly1, poly2)))
for (i in 1:4) {
A[i, ] = A_dep[i, ]
}
mu[1:2] = 2000
mu[3:4] = 10000
taxa_id = paste0("T", seq_len(d))
sample_id = paste0("S", seq_len(n))
rownames(A) = taxa_id
colnames(A) = sample_id
p_a = data.frame(t(log(A[1:5, ]))) %>%
ggpairs(title = "Synthetic Log Absolute Abundances",
progress = FALSE,
upper = NULL, diag = NULL) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
strip.background = element_rect(fill = "white"))
print(p_a)

1.2 Observed abundances
# Sequencing efficiency
C = rbeta(n = d, shape1 = 5, shape2 = 5)
# Microbial loads in the ecosystem
A_prim = A * C
A_dot = colSums(A_prim)
# Relative abundances in the ecosystem
R = A_prim/t(replicate(d, A_dot))
# Sampling fractions
S = rbeta(n = n, shape1 = 2, shape2 = 10)
# Library sizes
O_dot = round(S * A_dot)
# Observed abundances
O = matrix(NA, nrow = d, ncol = n)
for (i in seq(n)) {
O[, i] = rmultinom(1, size = O_dot[i], prob = R[, i])
}
rownames(O) = taxa_id
colnames(O) = sample_id
# Random errors
E = O/(A * C * t(replicate(d, S)))
# Log scale parameters
a = log(A)
c = log(C)
s = log(S)
e = log(E)
# Log scale variables
o = log(O)
r = O/t(replicate(d, O_dot))
p_o = data.frame(t(log(O[1:5, ]))) %>%
ggpairs(title = "Synthetic Log Observed Abundances",
progress = FALSE, upper = NULL, diag = NULL) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
strip.background = element_rect(fill = "white"))
print(p_o)

1.3 Example: taxon 1 vs. taxon 2
# Absolute abundance
A_t12 = data.frame(t(A[1:2, ]))
ls_cor_test = cor.test(x = A_t12$T1, y = A_t12$T2, method = "pearson")
df_ann = data.frame(x = 4000, y = 4000) %>%
mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
"\n", "p = ", round(ls_cor_test$p.value, 2)))
p_A_t12 = A_t12 %>%
ggplot(aes(x = T1, y = T2)) +
geom_point(size = 4, alpha = 0.6) +
labs(title = "Absolute Abundances for T1 and T2") +
geom_label(data = df_ann, aes(x = x, y = y, label = label),
size = 6, vjust = -0.5, hjust = 0, color = "black") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
# Observed abundance
O_t12 = data.frame(t(O[1:2, ]))
ls_cor_test = cor.test(x = O_t12$T1, y = O_t12$T2, method = "pearson")
df_ann = data.frame(x = 600, y = 710) %>%
mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
"\n", "p = ", round(ls_cor_test$p.value, 2)))
p_O_t12 = O_t12 %>%
ggplot(aes(x = T1, y = T2)) +
geom_point(size = 4, alpha = 0.6) +
labs(title = "Observed Abundances for T1 and T2") +
geom_label(data = df_ann, aes(x = x, y = y, label = label),
size = 6, vjust = -0.5, hjust = 0, color = "black") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
ggarrange(p_A_t12, p_O_t12, ncol = 2)

2. Standard Pearson correlation
2.1 Correlation matrix
o[is.infinite(o)] = NA
res_sample = Hmisc::rcorr(t(o), type = "pearson")
corr_sample = res_sample$r
corr_sample_p = res_sample$P
diag(corr_sample_p) = 0
corr_sample_fl = p_filter(corr_sample, corr_sample_p, max_p = 0.005)
df = corr_sample_fl[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_sample = df_p %>%
ggplot(aes(var1, var2, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, position = "top") +
scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Standard Pearson") +
theme_minimal() +
theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.7, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_sample

2.2 P-value
p_dense_sample = p_dens(corr_sample_p)
p_dense_sample = p_dense_sample +
labs(title = "Standard Pearson") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p_dense_sample)
3. SparCC
corr_sparcc = sparcc(t(O))$Cor
colnames(corr_sparcc) = rownames(O)
rownames(corr_sparcc) = rownames(O)
corr_sparcc_th = hard_thresh(corr_sparcc, th = 0.3)
df = corr_sparcc_th[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_sparcc = df_p %>%
ggplot(aes(var1, var2, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, position = "top") +
scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SparCC") +
theme_minimal() +
theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.7, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_sparcc

4. SECOM
OTU = otu_table(O, taxa_are_rows = TRUE)
meta_data = data.frame(sample_id = sample_id)
META = sample_data(meta_data)
sample_names(META) = meta_data$sample_id
otu_data = phyloseq(OTU, META)
pseqs = list(c(otu_data, otu_data))
pseudo = 0; zero_cut = 0.5; corr_cut = 0.5; lib_cut = 1000
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; seed = 123; thresh_hard = 0.3; max_p = 0.005; n_cl = 5
res_linear = secom_linear(pseqs, pseudo, zero_cut, corr_cut, lib_cut,
wins_quant, method, soft, thresh_len, n_cv,
seed, thresh_hard, max_p, n_cl)
R = 1000; max_p = 0.005
res_dist = secom_dist(pseqs, pseudo, zero_cut, corr_cut, lib_cut,
wins_quant, R, seed, max_p, n_cl)
4.1 Bias-corrected abundance estimation
y_hat = res_linear$y_hat
p_y_hat = data.frame(t(y_hat[1:5, ])) %>%
ggpairs(title = "Bias-Corrected Abundances",
progress = FALSE, upper = NULL, diag = NULL) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
strip.background = element_rect(fill = "white"))
print(p_y_hat)

4.2 Linear correlation
4.21 Thresholding
row_ind = which(rownames(res_linear$corr_th) %in% paste0("T", 1:5))
col_ind = which(colnames(res_linear$corr_th) %in% paste0("T", 1:5))
df = res_linear$corr_th[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_secom1 = df_p %>%
ggplot(aes(var1, var2, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, position = "top") +
scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SECOM (Pearson1)") +
theme_minimal() +
theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.7, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_secom1

4.22 Filtering
df = res_linear$corr_fl[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_secom2 = df_p %>%
ggplot(aes(var1, var2, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, position = "top") +
scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SECOM (Pearson2)") +
theme_minimal() +
theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.7, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_secom2

4.23 P-value
p_dense_secom1 = p_dens(res_linear$corr_p)
p_dense_secom1 = p_dense_secom1 +
labs(title = "SECOM (Pearson2)") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p_dense_secom1)
4.3 Distance correlation
4.31 Filtering
df = res_dist$dcorr_fl[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_secom3 = df_p %>%
ggplot(aes(var1, var2, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, position = "top") +
scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SECOM (Distance)") +
theme_minimal() +
theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.7, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_secom3

4.32 P-value
p_dense_secom2 = p_dens(res_dist$dcorr_p)
p_dense_secom2 = p_dense_secom2 +
labs(title = "SECOM (Distance)") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p_dense_secom2)
5. Outputs
p_main1 = ggarrange(ggmatrix_gtable(p_a), ggmatrix_gtable(p_o),
ncol = 2, nrow = 1, labels = c("a", "b"))
p_main2 = ggarrange(p_sample, p_sparcc, p_secom3, ncol = 3,
labels = c("c", "d", "e"), common.legend = TRUE,
legend = "bottom")
p_main = ggarrange(p_main1, p_main2, ncol = 1, nrow = 2)
ggsave(plot = p_main, "../images/main/illustrate.pdf", height = 12, width = 12)
ggsave(plot = p_main, "../images/main/illustrate.jpeg", height = 12, width = 12, dpi = 300)
df_p = data.frame(p = corr_sample_p[upper.tri(corr_sample_p)], method = "Pearson") %>%
bind_rows(
data.frame(p = res_dist$dcorr_p[upper.tri(res_dist$dcorr_p)], method = "SECOM (Distance)")
)
df_line = data.frame(xintercept = c(0.01, 0.05),
stringsAsFactors = FALSE)
p_supp = df_p %>%
ggplot(aes(x = p, color = method)) +
scale_color_npg(name = NULL) +
stat_ecdf(geom = "point") +
geom_vline(xintercept = 0.005,
linetype = "dashed", size = 1) +
labs(x = "P-value", y = "Cumulative density") +
scale_x_continuous(breaks = c(0.005, 0.25, 0.50, 0.75, 1.00)) +
theme_bw()
p_supp

ggsave(plot = p_supp, "../images/supp/supp_p_value.pdf", height = 5, width = 6.25)
ggsave(plot = p_supp, "../images/supp/supp_p_value.jpeg", height = 5, width = 6.25, dpi = 300)